Libraries

library(ggplot2)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(fpc)
library(dbscan)
## 
## Attaching package: 'dbscan'
## The following object is masked from 'package:fpc':
## 
##     dbscan
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)

2.1 - K-means Clustering

A - Read and Prepare Data

i: Omit attribute

# Seed the random number generator once, up front, so that randomised steps
# further down (such as the kmeans() calls) are repeatable when the script is
# run from top to bottom.
set.seed(1122)
# Clustering is unsupervised, so the labels must not be used as features:
# the "Name" attribute is left out of the data that gets clustered.

paste("We should remove the labels as clustering is an unsupervised algorithm. Get rid of 'Name'")
## [1] "We should remove the labels as clustering is an unsupervised algorithm. Get rid of 'Name'"

ii: Standardize?

# Written answer only: the scale() call itself is made once the text file has
# been parsed into a data frame.
paste("The data is between 0 and 5. We can standardize this to to get a better understanding of the data, where mean=0, var=1. Will do this after getting the dataset.")
## [1] "The data is between 0 and 5. We can standardize this to to get a better understanding of the data, where mean=0, var=1. Will do this after getting the dataset."

Clean up the TXT file

Also normalize it

# Parse the raw text file by hand: each record line carries eight
# single-digit attributes at fixed character positions.
data2 <- read.table(file = "file19.txt", header = FALSE, sep = "\t", dec = ".")

# Drop the first three rows. This assumes the file parses to a single column,
# in which case `[` collapses the result to a plain vector with one element
# per line of text (the indexing below relies on that).
df <- data2[-(1:3), ]

# Character positions of the eight attributes (17, 19, ..., 31) and the
# elements of `df` that hold the records to parse.
attr_positions <- seq(17, 31, by = 2)
record_index <- 2:67

# Split every record into single characters and convert the eight attribute
# characters to integers. Building all rows in one pass avoids growing the
# data frame with rbind() inside a loop, and no variable named `str` is
# created any more (it used to shadow utils::str()).
record_chars <- str_split(as.character(df[record_index]), "")
attr_values <- vapply(
  record_chars,
  function(chars) strtoi(chars[attr_positions]),
  integer(length(attr_positions))
)

# vapply() returns one column per record; transpose to one row per record.
dat_frame <- as.data.frame(t(attr_values))
colnames(dat_frame) <- c("I", "i", "C", "c", "P", "p", "M", "m")

# Standardize every column (centre on the mean, divide by the standard
# deviation).
df <- as.data.frame(scale(dat_frame))

Save the data

# Persist the standardized data and load it back, so the following steps work
# from the saved CSV. TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
write.csv(df, file = "file19saved.csv", row.names = FALSE)
# read.csv() already returns a data frame, so no further conversion is needed.
df <- read.csv("file19saved.csv", header = TRUE, sep = ",")
head(df)
##           I          i         C          c          P         p         M
## 1 2.6808138  1.3970672 0.8257228  0.9059288  0.3610716 0.4549113 1.3254890
## 2 1.0037170  0.5353248 0.8257228  0.9059288  1.1828206 1.2450204 0.4678196
## 3 1.0037170 -0.3264176 0.8257228 -1.0871146  0.3610716 0.4549113 0.4678196
## 4 1.0037170  0.5353248 0.8257228  0.9059288  1.1828206 1.2450204 0.4678196
## 5 0.1651686  0.5353248 0.8257228  0.9059288  0.3610716 0.4549113 0.4678196
## 6 0.1651686  0.5353248 0.8257228  0.9059288 -0.4606775 0.4549113 0.4678196
##           m
## 1 1.3404041
## 2 0.3574411
## 3 0.3574411
## 4 0.3574411
## 5 0.3574411
## 6 0.3574411

B - Clustering

# Silhouette criterion across candidate cluster counts, used to choose k.
fviz_nbclust(df, kmeans, method = "silhouette")

# Alternative criterion (within-cluster sum of squares), left disabled.
#fviz_nbclust(df, kmeans, method = "wss")
# Fit k-means with the 8 clusters read off the silhouette plot.
k <- kmeans(df, centers=8)
paste("looking at silhoutte graph gives 8 clusters.")
## [1] "looking at silhoutte graph gives 8 clusters."
# Plot the fitted clusters.
fviz_cluster(k, df, main="K-means Cluster with k=8")

# Cluster assignment for each of the 66 rows.
k$cluster
##  [1] 7 7 7 7 1 1 1 1 1 1 1 8 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 7 7 7 7 4 7 4
## [39] 4 2 2 2 4 2 2 4 2 2 2 2 2 4 4 5 2 5 1 6 6 6 6 6 6 6 6 6
# The counts below were typed in by hand from the assignment vector above.
paste("There are this many points in each clusters:  1: 8,  2: 11,  3: 19,  4: 7,  5: 2,  6: 9,  7: 9,  8: 1")
## [1] "There are this many points in each clusters:  1: 8,  2: 11,  3: 19,  4: 7,  5: 2,  6: 9,  7: 9,  8: 1"
paste("total SSE-WSS of the clusters:")
## [1] "total SSE-WSS of the clusters:"
# Total within-cluster sum of squares over all clusters.
k$tot.withinss
## [1] 55.12383
paste("SSEs for each clusters")
## [1] "SSEs for each clusters"
# Within-cluster sum of squares, one value per cluster.
k$withinss
## [1]  4.244696  5.932404 15.568186  3.606768  2.223560  6.337449 17.210772
## [8]  0.000000
# Hand-rounded version of the per-cluster values printed above.
paste("SSE in each clusters:  1: 4,  2: 6,  3: 16,  4: 4,  5: 2,  6: 6,  7: 17,  8: 0")
## [1] "SSE in each clusters:  1: 4,  2: 6,  3: 16,  4: 4,  5: 2,  6: 6,  7: 17,  8: 0"
# Row indices belonging to each of the 8 clusters, each followed by a written
# interpretation of that cluster.
# NOTE(review): the cluster numbers come from this particular kmeans() fit; a
# different seed or random start can renumber or reshape the clusters, so the
# interpretations are tied to the output recorded here.
which(k$cluster==1)
## [1]  5  6  7  8  9 10 11 57
paste("all of these are mostly bat types of animals")
## [1] "all of these are mostly bat types of animals"
which(k$cluster==2)
##  [1] 40 41 42 44 45 47 48 49 50 51 55
paste("This cluster is cut into two with  mole-type animals and cougar type animals.")
## [1] "This cluster is cut into two with  mole-type animals and cougar type animals."
which(k$cluster==3)
##  [1] 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
paste("mostly weasel-squirrel type of animals. However there are a lot of variety. This cluster needs improvement.")
## [1] "mostly weasel-squirrel type of animals. However there are a lot of variety. This cluster needs improvement."
which(k$cluster==4)
## [1] 36 38 39 43 46 52 53
paste("these are wild-cat like animals. Good job in clustering")
## [1] "these are wild-cat like animals. Good job in clustering"
which(k$cluster==5)
## [1] 54 56
paste("these are walrus and elephant seal animals. The cluster is too small.")
## [1] "these are walrus and elephant seal animals. The cluster is too small."
which(k$cluster==6)
## [1] 58 59 60 61 62 63 64 65 66
paste("This cluster is mostly elk type animals. Generally good.")
## [1] "This cluster is mostly elk type animals. Generally good."
which(k$cluster==7)
## [1]  1  2  3  4 32 33 34 35 37
paste("These are wild-cats and moles. Cluster should be divided into two.")
## [1] "These are wild-cats and moles. Cluster should be divided into two."
which(k$cluster==8)
## [1] 12
paste("this is an armadillo, it is natural that this was quite different than the others.")
## [1] "this is an armadillo, it is natural that this was quite different than the others."
# Overall assessment of the k = 8 solution.
paste("Overall, the clustering is good. Some clusters needs improvement, we can do it maybe by increasing the amount of clusters.")
## [1] "Overall, the clustering is good. Some clusters needs improvement, we can do it maybe by increasing the amount of clusters."

2.2 DBSCAN

Retrieving the data

# Load the two-column (x, y) point set used for the DBSCAN part. TRUE is
# spelled out because T is an ordinary variable that can be reassigned.
df2 <- read.csv("s1.csv", header = TRUE, sep = ",")
head(df2)
##        x      y
## 1 664159 550946
## 2 665845 557965
## 3 597173 575538
## 4 618600 551446
## 5 635690 608046
## 6 588100 557588

A - Normalize the data

# Column means on the raw scale, before standardizing.
colMeans(df2)
##        x        y 
## 514937.6 494709.3
paste("We have to normalize the data since the numbers are too big to make sense of. Normalizing will allow the mean to be 0, and the variance to be 1.")
## [1] "We have to normalize the data since the numbers are too big to make sense of. Normalizing will allow the mean to be 0, and the variance to be 1."
# Standardize both columns; scale() returns a matrix, so convert it back to a
# data frame.
df2 <- as.data.frame(scale(df2))
head(df2)
##           x         y
## 1 0.6103978 0.2384519
## 2 0.6172944 0.2682135
## 3 0.3363882 0.3427256
## 4 0.4240364 0.2405720
## 5 0.4939439 0.4805644
## 6 0.2992746 0.2666150
# Sanity check: the column means are now zero up to floating-point error.
colMeans(df2)
##             x             y 
## -4.381495e-17  1.983969e-17

B - Plot the dataset & observe

# Scatter plot of the standardized points, for a visual estimate of the
# number of clusters.
plot(df2, main="Data without clustering")

paste("I can see 15 clusters in here, which are well separated.")
## [1] "I can see 15 clusters in here, which are well separated."

C - K-means

# Within-cluster sum of squares and silhouette criteria for up to 18 clusters.
fviz_nbclust(df2, kmeans, method = "wss", k.max=18)

fviz_nbclust(df2, kmeans, method = "silhouette", k.max=18)

# Try several candidate cluster counts in turn (14, 15, 12, 13). k2 is
# overwritten on every trial, so only the last fit (13 centers) is still
# held in k2 after this chunk.
k2 <- kmeans(df2, centers=14)
fviz_cluster(k2, df2, main="K-means with centers=14")

paste("Problem with 4 clusters.")
## [1] "Problem with 4 clusters."
k2 <- kmeans(df2, centers=15)
fviz_cluster(k2, df2, main="K-means with centers=15")

paste("Problem with 4 clusters.")
## [1] "Problem with 4 clusters."
k2 <- kmeans(df2, centers=12)
fviz_cluster(k2, df2, main="K-means with centers=12")

paste("Problem with 6 clusters.")
## [1] "Problem with 6 clusters."
k2 <- kmeans(df2, centers=13)
fviz_cluster(k2, df2, main="K-means with centers=13")

paste("Problem with 2 clusters.")
## [1] "Problem with 2 clusters."
# Final choice for k-means on this data set.
paste("Therefore, I will select 13 clusters for k-means. The clustering is generally better more consistently .")
## [1] "Therefore, I will select 13 clusters for k-means. The clustering is generally better more consistently ."

E - perform DBSCAN

i: MinPts

# Reasoning behind the choice of MinPts for DBSCAN.
paste("some clusters are quite close to each other, and the data points are close to each other in each cluster too.")
## [1] "some clusters are quite close to each other, and the data points are close to each other in each cluster too."
# Per-cluster within sum of squares and cluster sizes of the k-means fit
# currently held in k2 (13 values each in the recorded output).
k2$withinss
##  [1]  12.196766  10.691121 115.641971 123.234195   8.158955   5.003423
##  [7]   9.867617  10.778042  27.777577  12.654594  93.374137   3.258654
## [13]  11.028566
k2$size
##  [1] 345 338 624 686 329 216 351 297 399 340 633 125 317
paste("Since we have 2-D data, We can set it to 4. UPDATE: after grid searching with different minPts and epsses, I decided to set it to 6. Considering our data is condensed, we can do this.")
## [1] "Since we have 2-D data, We can set it to 4. UPDATE: after grid searching with different minPts and epsses, I decided to set it to 6. Considering our data is condensed, we can do this."

ii: eps

# Neighbourhood size: used as k for the k-nearest-neighbour distance plot
# here and as MinPts in the DBSCAN calls that follow.
K <- 6

# Plot the k-nearest-neighbour distances of the points; the bend in this
# curve is read off as the candidate eps.
dbscan::kNNdistplot(df2, K)
paste("I can see that the eps can be 0.08")
## [1] "I can see that the eps can be 0.08"
# Mark the chosen eps as a dashed horizontal line on the plot.
abline(h = 0.08, lty = 2)

Cluster

# Title for a DBSCAN cluster plot, built from the parameters actually passed
# to the fit so the title cannot drift out of sync with the call. (The
# hard-coded titles previously said minPts=5 although MinPts = K = 6 was used,
# and one of them named the wrong eps.)
dbscan_title <- function(eps, min_pts) {
  paste0("Cluster Plot minPts=", min_pts, ", eps=", eps)
}

# Sweep eps around the value read off the k-NN distance plot while keeping
# MinPts = K fixed. In each printed table, column 0 holds the points that
# fpc::dbscan() left unassigned (noise).
eps <- 0.09
db <- fpc::dbscan(df2, eps = eps, MinPts = K)
fviz_cluster(db, df2, geom = "point", main = dbscan_title(eps, K))

print(db)
## dbscan Pts=5000 MinPts=6 eps=0.09
##          0   1   2 3   4   5   6   7   8   9  10  11  12  13  14  15
## border 140  15  13 5   1  13   4  11   8  12  10  17  15   4   9  12
## seed     0 264 310 1 309 323 304 313 313 648 318 309 310 342 333 314
## total  140 279 323 6 310 336 308 324 321 660 328 326 325 346 342 326
eps <- 0.085
db <- fpc::dbscan(df2, eps = eps, MinPts = K)
fviz_cluster(db, df2, geom = "point", main = dbscan_title(eps, K))

print(db)
## dbscan Pts=5000 MinPts=6 eps=0.085
##          0   1   2 3   4   5   6   7   8   9  10  11  12  13  14  15
## border 163  14  13 5   2  16   6  12   9  13  11  17  14   7  11  13
## seed     0 262 308 1 308 318 302 307 311 645 316 308 306 339 330 313
## total  163 276 321 6 310 334 308 319 320 658 327 325 320 346 341 326
eps <- 0.082
db <- fpc::dbscan(df2, eps = eps, MinPts = K)
fviz_cluster(db, df2, geom = "point", main = dbscan_title(eps, K))

print(db)
## dbscan Pts=5000 MinPts=6 eps=0.082
##          0   1   2 3   4   5   6   7   8   9  10  11  12  13  14  15  16
## border 188  14  13 5   2  17   8  15   8   5  13  12  18  16   7  11   6
## seed     0 259 308 1 308 317 300 304 308 321 319 313 306 302 339 328 309
## total  188 273 321 6 310 334 308 319 316 326 332 325 324 318 346 339 315
eps <- 0.08
db <- fpc::dbscan(df2, eps = eps, MinPts = K)
fviz_cluster(db, df2, geom = "point", main = dbscan_title(eps, K))

print(db)
## dbscan Pts=5000 MinPts=6 eps=0.08
##          0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
## border 208  14  14   2  18  11  18   8   6  14  14  16  16   7  11   6
## seed     0 259 307 308 316 297 301 308 319 318 310 304 296 338 327 309
## total  208 273 321 310 334 308 319 316 325 332 324 320 312 345 338 315
eps <- 0.078
db <- fpc::dbscan(df2, eps = eps, MinPts = K)
fviz_cluster(db, df2, geom = "point", main = dbscan_title(eps, K))

print(db)
## dbscan Pts=5000 MinPts=6 eps=0.078
##          0   1   2   3   4   5   6   7   8   9  10 11  12  13  14  15 16  17
## border 219  18  16   2  16  11  21  11  10  17  13  4  15  19   7  12  4   8
## seed     0 255 304 308 311 296 297 305 315 314 303  4 302 292 337 325  2 307
## total  219 273 320 310 327 307 318 316 325 331 316  8 317 311 344 337  6 315
# The eps values tried range from 0.078 to 0.09; the 15-cluster result without
# tiny clusters is the eps = 0.08 run (the text previously said 0.8).
paste("Best result is from eps=0.08 with 15 clusters that make sense.")
## [1] "Best result is from eps=0.08 with 15 clusters that make sense."

DBSCAN Result

# Written summary of the MinPts / eps search. Only the MinPts = 6 runs are
# shown in this document; the figures for MinPts = 4 and 5 are reported here
# without their code.
paste("At minPts = 4, I tried different epsses. Best eps = 0.08, there are 20 clusters. At minPts = 5, I tried different epsses. Best eps = 0.08, there are 17 clusters. At minPts = 6, I tried different epsses. Best eps = 0.08, there are 15 clusters.")
## [1] "At minPts = 4, I tried different epsses. Best eps = 0.08, there are 20 clusters. At minPts = 5, I tried different epsses. Best eps = 0.08, there are 17 clusters. At minPts = 6, I tried different epsses. Best eps = 0.08, there are 15 clusters."
# NOTE(review): the next statement names minPts=5 as the best setting, while
# K is set to 6 for the DBSCAN runs and the 15-cluster result is reported for
# minPts = 6 in the line above. Confirm which value is intended.
paste("Overall, best result was from when minPts=5, eps=0.08. There was less data loss, and the clusters made sense.")
## [1] "Overall, best result was from when minPts=5, eps=0.08. There was less data loss, and the clusters made sense."